notebooks/Common Crawl.ipynb

{ "cells": [
 { "cell_type": "code", "execution_count": 4, "id": "8241443f-2ac6-4e3a-b6a9-a4d7868ed627", "metadata": {}, "outputs": [], "source": [
  "import requests\n",
  "import json\n",
  "import os\n",
  "import pandas as pd\n",
  "# For parsing URLs:\n",
  "from urllib.parse import quote_plus\n",
  "import nltk\n",
  "from langdetect import detect\n"
 ] },
 { "cell_type": "code", "execution_count": 5, "id": "a2ae13aa-fe51-4c9e-aaf7-0f9eae375a82", "metadata": {}, "outputs": [], "source": [
  "def search_cc_index(url, index_name, timeout=30):\n",
  "    \"\"\"\n",
  "    Search the Common Crawl Index for a given URL.\n",
  "\n",
  "    This function queries the Common Crawl Index API to find records related to the specified URL.\n",
  "    It uses the index specified by `index_name` to retrieve the data and returns a list of JSON objects,\n",
  "    each representing a record from the index.\n",
  "\n",
  "    Arguments:\n",
  "        url (str): The URL to search for in the Common Crawl Index.\n",
  "        index_name (str): The name of the Common Crawl Index to search (e.g., \"CC-MAIN-2024-10\").\n",
  "        timeout (float): Seconds to wait for the index server before giving up.\n",
  "\n",
  "    Returns:\n",
  "        list: A list of JSON objects representing records found in the Common Crawl Index.\n",
  "        Returns None if the request fails or no records are found.\n",
  "\n",
  "    Example:\n",
  "        >>> search_cc_index(\"example.com\", \"CC-MAIN-2024-10\")\n",
  "        [{...}, {...}, ...]\n",
  "    \"\"\"\n",
  "    encoded_url = quote_plus(url)\n",
  "    index_url = f'https://index.commoncrawl.org/{index_name}-index?url={encoded_url}&output=json'\n",
  "    try:\n",
  "        # Without a timeout a stalled index server would hang the notebook forever.\n",
  "        response = requests.get(index_url, timeout=timeout)\n",
  "    except requests.RequestException:\n",
  "        # DNS errors, timeouts and dropped connections all count as a failed request.\n",
  "        return None\n",
  "\n",
  "    if response.status_code != 200:\n",
  "        return None\n",
  "\n",
  "    # The index answers with one JSON document per line; blank lines are skipped so an\n",
  "    # empty body yields no records instead of a JSONDecodeError.\n",
  "    records = [json.loads(line) for line in response.text.splitlines() if line.strip()]\n",
  "    return records or None"
 ] },
 { "cell_type": "code", "execution_count": 6, "id": "2a67b1b1-f49b-4bf1-8f09-7eb297bfa552", "metadata": {}, "outputs": [], "source": [
  "from warcio.archiveiterator import ArchiveIterator\n",
  "from bs4 import BeautifulSoup\n",
  "import sys\n",
  "import nltk\n",
  "from langdetect import detect\n",
  "from langdetect import DetectorFactory\n",
  "import re\n",
  "import pandas as pd\n",
  "from urllib.parse import urlsplit\n",
  "\n",
  "# langdetect is randomized by default; pin its seed so repeated runs classify titles identically.\n",
  "DetectorFactory.seed = 0\n",
  "\n",
  "# Matches any run of non-ASCII characters. NOTE: despite its name this is stricter than\n",
  "# \"non-Latin\": accented Latin letters and typographic quotes/dashes match too.\n",
  "non_latin_pattern = re.compile(r'[^\\x00-\\x7F]+')\n",
  "\n",
  "# Final hostname labels a page's URL must end with to be kept.\n",
  "allowed_domains = {'com', 'gov', 'edu', 'co', 'uk', 'net', 'mil', 'ai', 'ca'}\n",
  "\n",
  "# A progress counter is printed every PROGRESS_EVERY accepted pages.\n",
  "PROGRESS_EVERY = 20\n",
  "\n",
  "\n",
  "def is_english(text):\n",
  "    \"\"\"\n",
  "    Return True if `text` is ASCII-only and langdetect classifies it as English.\n",
  "    Text that langdetect cannot classify counts as not English.\n",
  "    \"\"\"\n",
  "    try:\n",
  "        # Cheap regex first, so langdetect only runs on ASCII-only text.\n",
  "        return not non_latin_pattern.search(text) and detect(text) == 'en'\n",
  "    except Exception:\n",
  "        # Best effort; unlike a bare `except:` this still lets Ctrl-C stop a long run.\n",
  "        return False\n",
  "\n",
  "\n",
  "def is_latin_not_english(text):\n",
  "    \"\"\"\n",
  "    Return True if `text` is ASCII-only and langdetect classifies it as a language other\n",
  "    than English. Text that langdetect cannot classify yields False.\n",
  "    \"\"\"\n",
  "    try:\n",
  "        return not non_latin_pattern.search(text) and detect(text) != 'en'\n",
  "    except Exception:\n",
  "        return False\n",
  "\n",
  "\n",
  "def get_last_domain_part(url: str) -> str:\n",
  "    \"\"\"\n",
  "    Return the last dot-separated label of the URL's hostname (e.g. 'com' or 'uk').\n",
  "\n",
  "    The hostname is lower-cased and stripped of port, credentials and a trailing dot.\n",
  "    Returns '' when the URL has no hostname (for example when the scheme is missing).\n",
  "    \"\"\"\n",
  "    try:\n",
  "        hostname = urlsplit(url).hostname\n",
  "    except ValueError:\n",
  "        # urlsplit rejects malformed netlocs such as an unterminated IPv6 literal.\n",
  "        return ''\n",
  "    if not hostname:\n",
  "        return ''\n",
  "    return hostname.rstrip('.').rsplit('.', 1)[-1]\n",
  "\n",
  "\n",
  "def is_error_response(input: str) -> bool:\n",
  "    \"\"\"\n",
  "    Return True if a page title looks like an error page: it contains the standalone\n",
  "    word '404' or the phrases 'no response' / 'not found' (case-insensitive).\n",
  "    \"\"\"\n",
  "    block_words = {\"404\"}\n",
  "    lowered = input.lower()\n",
  "    if any(word in block_words for word in lowered.split()):\n",
  "        return True\n",
  "    return \"no response\" in lowered or \"not found\" in lowered\n",
  "\n",
  "\n",
  "def _meta_content(soup, attrs):\n",
  "    \"\"\"Return the stripped `content` of the first <meta> tag matching `attrs`, or ''.\"\"\"\n",
  "    tag = soup.find('meta', attrs=attrs)\n",
  "    return (tag.get('content') or '').strip() if tag else ''\n",
  "\n",
  "\n",
  "def _page_description(soup):\n",
  "    \"\"\"Return og:description, else the meta description, else 'No Description'.\"\"\"\n",
  "    # `or` chaining also falls through when a tag exists but its content is empty.\n",
  "    return (\n",
  "        _meta_content(soup, {'property': 'og:description'})\n",
  "        or _meta_content(soup, {'name': 'description'})\n",
  "        or 'No Description'\n",
  "    )\n",
  "\n",
  "\n",
  "def _extract_pages(warc_file, lang_is_english, title_filter, max_results=None):\n",
  "    \"\"\"\n",
  "    Shared scan behind extract_english_files and extract_non_english_latin.\n",
  "\n",
  "    Arguments:\n",
  "        warc_file (str): Path of the WARC file to read.\n",
  "        lang_is_english (bool): Keep pages whose <html lang> starts with 'en' (True)\n",
  "            or does not start with 'en' (False).\n",
  "        title_filter (callable): Predicate the page title must satisfy.\n",
  "        max_results (int | None): Stop after this many pages; None means no limit.\n",
  "\n",
  "    Returns:\n",
  "        list: Dictionaries with 'url', 'description' and 'title' keys.\n",
  "    \"\"\"\n",
  "    results = []\n",
  "    with open(warc_file, 'rb') as stream:\n",
  "        for record in ArchiveIterator(stream):\n",
  "            if max_results is not None and len(results) >= max_results:\n",
  "                break\n",
  "            if record.rec_type != 'response':\n",
  "                continue\n",
  "            # Guard against response records that carry no parsed HTTP headers.\n",
  "            http_headers = record.http_headers\n",
  "            content_type = (http_headers.get('Content-Type', '') or '') if http_headers else ''\n",
  "            if 'text/html' not in content_type.lower():\n",
  "                continue\n",
  "\n",
  "            # Filter on the URL before parsing: it is far cheaper than building the soup.\n",
  "            url = record.rec_headers.get('WARC-Target-URI')\n",
  "            if not url or get_last_domain_part(url) not in allowed_domains:\n",
  "                continue\n",
  "\n",
  "            soup = BeautifulSoup(record.content_stream().read(), 'html.parser')\n",
  "            html_tag = soup.find('html')\n",
  "            if html_tag is None:\n",
  "                continue\n",
  "            # Normalise so that values like 'EN-US' or ' en' are recognised as English.\n",
  "            lang = (html_tag.get('lang') or '').strip().lower()\n",
  "            if lang.startswith('en') != lang_is_english:\n",
  "                continue\n",
  "\n",
  "            title_tag = soup.find('title')\n",
  "            title = title_tag.text.strip() if title_tag else ''\n",
  "            if not title or is_error_response(title) or not title_filter(title):\n",
  "                continue\n",
  "\n",
  "            if len(results) % PROGRESS_EVERY == 0:\n",
  "                print(len(results))\n",
  "            results.append({\"url\": url, \"description\": _page_description(soup), \"title\": title})\n",
  "    return results\n",
  "\n",
  "\n",
  "def extract_english_files(warc_file, max_results=None):\n",
  "    \"\"\"\n",
  "    Returns a list of dictionaries with url, description and title keys from\n",
  "    English web pages in a WARC file.\n",
  "\n",
  "    A page is kept when it is an HTML response on an allowed domain, its <html lang>\n",
  "    starts with 'en', and its title is not an error title and passes is_english.\n",
  "    `max_results` optionally caps the number of pages returned (None = no cap).\n",
  "    \"\"\"\n",
  "    return _extract_pages(warc_file, True, is_english, max_results)\n",
  "\n",
  "\n",
  "def extract_non_english_latin(warc_file, max_results=800):\n",
  "    \"\"\"\n",
  "    Extracts pages whose title is ASCII-only but which a language detector determines\n",
  "    to be non-English, from pages whose <html lang> does not start with 'en'.\n",
  "    This is good for extracting error pages and pages in non-English languages.\n",
  "\n",
  "    Returns a list of dictionaries with url, description and title keys, capped at\n",
  "    `max_results` entries (800 by default; None = no cap).\n",
  "    \"\"\"\n",
  "    return _extract_pages(warc_file, False, is_latin_not_english, max_results)"
 ] },
 { "cell_type": "code", "execution_count": 7, "id": "13c9fa44-0352-412b-9f91-6a1172fd40ca", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/Rrando/Documents/GitHub/smart-tab-grouping/notebooks\n" ] } ], "source": [ "!pwd" ] },
 { "cell_type": "code", "execution_count": 8, "id": "977d158a-acdb-4e91-8edc-e9dd7198d9bf", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "20\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n", "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "40\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "60\n", "80\n" ] }, { "name": 
"stderr", "output_type": "stream", "text": [ "/var/folders/jq/lt0dyzf14y93d8k18k68_4fm0000gn/T/ipykernel_10927/2355896864.py:55: XMLParsedAsHTMLWarning: It looks like you're using an HTML parser to parse an XML document.\n", "\n", "Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features=\"xml\"` into the BeautifulSoup constructor.\n", "\n", "If you want or need to use an HTML parser on this document, you can make this warning go away by filtering it. To do that, run this code before calling the BeautifulSoup constructor:\n", "\n", " from bs4 import XMLParsedAsHTMLWarning\n", " import warnings\n", "\n", " warnings.filterwarnings(\"ignore\", category=XMLParsedAsHTMLWarning)\n", "\n", " soup = BeautifulSoup(payload, 'html.parser')\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "100\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n", "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "120\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "140\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "160\n", "180\n", "200\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "220\n", "240\n", 
"260\n", "280\n", "300\n", "320\n", "340\n", "360\n", "380\n", "400\n", "420\n", "440\n", "460\n", "480\n", "500\n", "520\n", "540\n", "560\n", "580\n", "600\n", "620\n", "640\n", "660\n", "680\n", "700\n", "720\n", "740\n", "760\n", "780\n", "800\n", "820\n", "840\n", "860\n", "880\n", "900\n", "920\n", "940\n", "960\n", "980\n", "1000\n", "1020\n", "1040\n", "1060\n", "1080\n", "1100\n", "1120\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1140\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1160\n", "1180\n", "1200\n", "1220\n", "1240\n", "1260\n", "1280\n", "1300\n", "1320\n", "1340\n", "1360\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1380\n", "1400\n", "1420\n", "1440\n", "1460\n", "1480\n", "1500\n", "1520\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1540\n", "1560\n", "1580\n", "1600\n", "1620\n", "1640\n", "1660\n", "1680\n", "1700\n", "1720\n", "1740\n", "1760\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1780\n", "1800\n", "1820\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ 
"1840\n", "1860\n", "1880\n", "1900\n", "1920\n", "1940\n", "1960\n", "1980\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2000\n", "2020\n", "2040\n", "2060\n", "2080\n", "2100\n", "2120\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2140\n", "2160\n", "2180\n", "2200\n", "2220\n", "2240\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2260\n", "2280\n", "2300\n", "2320\n", "2340\n", "2360\n", "2380\n", "2400\n", "2420\n", "2440\n", "2460\n", "2480\n", "2500\n", "2520\n", "2540\n", "2560\n", "2580\n", "2600\n", "2620\n", "2640\n", "2660\n", "2680\n", "2700\n", "2720\n", "2740\n", "2760\n", "2780\n", "2800\n", "2820\n", "2840\n", "2860\n", "2880\n", "2900\n", "2920\n", "2940\n", "2960\n", "2980\n", "3000\n", "3020\n", "3040\n", "3060\n", "3080\n", "3100\n", "3120\n", "3140\n", "3160\n", "3180\n", "3200\n", "3220\n", "3240\n", "3260\n", "3280\n", "3300\n", "3320\n", "3340\n", "3360\n", "3380\n", "3400\n", "3420\n", "3440\n", "3460\n", "3480\n", "3500\n", "3520\n", "3540\n", "3560\n", "3580\n", "3600\n", "3620\n", "3640\n", "3660\n", "3680\n", "3700\n", "3720\n", "3740\n", "3760\n", "3780\n", "3800\n", "3820\n", "3840\n", "3860\n", "3880\n", "3900\n", "3920\n", "3940\n", "3960\n", "3980\n", "4000\n", "4020\n", "4040\n", "4060\n", "4080\n", "4100\n", "4120\n", "4140\n", "4160\n", "4180\n", "4200\n", "4220\n", "4240\n", "4260\n", "4280\n", "4300\n", "4320\n", "4340\n", "4360\n", "4380\n", "4400\n", "4420\n", "4440\n", "4460\n", "4480\n" ] }, { "name": "stderr", "output_type": "stream", 
"text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n", "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4500\n", "4520\n", "4540\n", "4560\n", "4580\n", "4600\n", "4620\n", "4640\n", "4660\n", "4680\n", "4700\n", "4720\n", "4740\n", "4760\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4780\n", "4800\n", "4820\n", "0\n", "20\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n", "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "40\n", "60\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "80\n", "100\n", "120\n", "140\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n", "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "160\n", "180\n", "200\n", "220\n", "240\n", "260\n", "280\n", "300\n", "320\n", "340\n", "360\n", "380\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "400\n", "420\n", "440\n", "460\n", "480\n", "500\n", "520\n", "540\n", "560\n", "580\n", "600\n", "620\n", "640\n", "660\n", "680\n", "700\n", "720\n", "740\n", "760\n", "780\n", "800\n", 
"820\n", "840\n", "860\n", "880\n", "900\n", "920\n", "940\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "960\n", "980\n", "1000\n", "1020\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1040\n", "1060\n", "1080\n", "1100\n", "1120\n", "1140\n", "1160\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1180\n", "1200\n", "1220\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1240\n", "1260\n", "1280\n", "1300\n", "1320\n", "1340\n", "1360\n", "1380\n", "1400\n", "1420\n", "1440\n", "1460\n", "1480\n", "1500\n", "1520\n", "1540\n", "1560\n", "1580\n", "1600\n", "1620\n", "1640\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1660\n", "1680\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1700\n", "1720\n", "1740\n", "1760\n", "1780\n", "1800\n", "1820\n", "1840\n", "1860\n", "1880\n", "1900\n", "1920\n", "1940\n", "1960\n", "1980\n", "2000\n", "2020\n", "2040\n", "2060\n", "2080\n", "2100\n", "2120\n", "2140\n", "2160\n", "2180\n", "2200\n", "2220\n", "2240\n", "2260\n", "2280\n", "2300\n", "2320\n", 
"2340\n", "2360\n", "2380\n", "2400\n", "2420\n", "2440\n", "2460\n", "2480\n", "2500\n", "2520\n", "2540\n", "2560\n", "2580\n", "2600\n", "2620\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2640\n", "2660\n", "2680\n", "2700\n", "2720\n", "2740\n", "2760\n", "2780\n", "2800\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2820\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2840\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2860\n", "2880\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2900\n", "2920\n", "2940\n", "2960\n", "2980\n", "3000\n", "3020\n", "3040\n", "3060\n", "3080\n", "3100\n", "3120\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "3140\n", "3160\n", "3180\n", "3200\n", "3220\n", "3240\n", "3260\n", "3280\n", "3300\n", "3320\n", "3340\n", "3360\n", "3380\n", "3400\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "3420\n", 
"3440\n", "3460\n", "3480\n", "3500\n", "3520\n", "3540\n", "3560\n", "3580\n", "3600\n", "3620\n", "3640\n", "3660\n", "3680\n", "3700\n", "3720\n", "3740\n", "3760\n", "3780\n", "3800\n", "3820\n", "3840\n", "3860\n", "3880\n", "3900\n", "3920\n", "3940\n", "3960\n", "3980\n", "4000\n", "4020\n", "4040\n", "4060\n", "4080\n", "4100\n", "4120\n", "4140\n", "4160\n", "4180\n", "4200\n", "4220\n", "4240\n", "4260\n", "4280\n", "4300\n", "4320\n", "4340\n", "4360\n", "4380\n", "4400\n", "4420\n", "4440\n", "4460\n", "4480\n", "4500\n", "4520\n", "4540\n", "4560\n", "4580\n", "4600\n", "4620\n", "4640\n", "4660\n", "4680\n", "4700\n", "4720\n", "4740\n", "4760\n", "4780\n", "4800\n", "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "20\n", "40\n", "60\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "80\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "100\n", "120\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n", "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "140\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n", "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "160\n" ] }, { "name": 
"stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n", "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "180\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "200\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n", "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n", "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "220\n", "240\n", "260\n", "280\n", "300\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "320\n", "340\n", "360\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "380\n", "400\n", "420\n", "440\n", "460\n", "480\n", "500\n", "520\n", "540\n", "560\n", "580\n", "600\n", "620\n", "640\n", "660\n", "680\n", "700\n", "720\n", "740\n", "760\n", "780\n", "800\n", "820\n", "840\n", "860\n", "880\n", "900\n", "920\n", "940\n", "960\n", "980\n", "1000\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1020\n", "1040\n", "1060\n", "1080\n", "1100\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters 
could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1120\n", "1140\n", "1160\n", "1180\n", "1200\n", "1220\n", "1240\n", "1260\n", "1280\n", "1300\n", "1320\n", "1340\n", "1360\n", "1380\n", "1400\n", "1420\n", "1440\n", "1460\n", "1480\n", "1500\n", "1520\n", "1540\n", "1560\n", "1580\n", "1600\n", "1620\n", "1640\n", "1660\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1680\n", "1700\n", "1720\n", "1740\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1760\n", "1780\n", "1800\n", "1820\n", "1840\n", "1860\n", "1880\n", "1900\n", "1920\n", "1940\n", "1960\n", "1980\n", "2000\n", "2020\n", "2040\n", "2060\n", "2080\n", "2100\n", "2120\n", "2140\n", "2160\n", "2180\n", "2200\n", "2220\n", "2240\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2260\n", "2280\n", "2300\n", "2320\n", "2340\n", "2360\n", "2380\n", "2400\n", "2420\n", "2440\n", "2460\n", "2480\n", "2500\n", "2520\n", "2540\n", "2560\n", "2580\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n", "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2600\n", "2620\n", "2640\n", "2660\n", "2680\n", "2700\n", "2720\n", "2740\n", "2760\n", "2780\n", "2800\n", "2820\n", "2840\n", "2860\n", "2880\n", "2900\n", "2920\n", "2940\n", "2960\n", "2980\n", "3000\n", 
"3020\n", "3040\n", "3060\n", "3080\n", "3100\n", "3120\n", "3140\n", "3160\n", "3180\n", "3200\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "3220\n", "3240\n", "3260\n", "3280\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "3300\n", "3320\n", "3340\n", "3360\n", "3380\n", "3400\n", "3420\n", "3440\n", "3460\n", "3480\n", "3500\n", "3520\n", "3540\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "3560\n", "3580\n", "3600\n", "3620\n", "3640\n", "3660\n", "3680\n", "3700\n", "3720\n", "3740\n", "3760\n", "3780\n", "3800\n", "3820\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "3840\n", "3860\n", "3880\n", "3900\n", "3920\n", "3940\n", "3960\n", "3980\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4000\n", "4020\n", "4040\n", "4060\n", "4080\n", "4100\n", "4120\n", "4140\n", "4160\n", "4180\n", "4200\n", "4220\n", "4240\n", "4260\n", "4280\n", "4300\n", "4320\n", "4340\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4360\n", "4380\n", "4400\n", "4420\n", "4440\n", "4460\n", "4480\n" ] }, { "name": 
"stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4500\n", "4520\n", "4540\n", "4560\n", "4580\n", "4600\n", "4620\n", "4640\n", "4660\n", "4680\n", "4700\n", "4720\n", "4740\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4760\n", "4780\n", "4800\n", "4820\n", "4840\n", "4860\n", "4880\n", "4900\n", "0\n", "20\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "40\n", "60\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "80\n", "100\n", "120\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n", "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "140\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "160\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "180\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n", "Some characters could not be 
decoded, and were replaced with REPLACEMENT CHARACTER.\n", "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "200\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "220\n", "240\n", "260\n", "280\n", "300\n", "320\n", "340\n", "360\n", "380\n", "400\n", "420\n", "440\n", "460\n", "480\n", "500\n", "520\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "540\n", "560\n", "580\n", "600\n", "620\n", "640\n", "660\n", "680\n", "700\n", "720\n", "740\n", "760\n", "780\n", "800\n", "820\n", "840\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "860\n", "880\n", "900\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "920\n", "940\n", "960\n", "980\n", "1000\n", "1020\n", "1040\n", "1060\n", "1080\n", "1100\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n", "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1120\n", "1140\n", "1160\n", "1180\n", "1200\n", "1220\n", "1240\n", "1260\n", "1280\n", "1300\n", "1320\n", "1340\n", "1360\n", "1380\n", "1400\n", "1420\n", "1440\n", "1460\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some 
characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1480\n", "1500\n", "1520\n", "1540\n", "1560\n", "1580\n", "1600\n", "1620\n", "1640\n", "1660\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1680\n", "1700\n", "1720\n", "1740\n", "1760\n", "1780\n", "1800\n", "1820\n", "1840\n", "1860\n", "1880\n", "1900\n", "1920\n", "1940\n", "1960\n", "1980\n", "2000\n", "2020\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2040\n", "2060\n", "2080\n", "2100\n", "2120\n", "2140\n", "2160\n", "2180\n", "2200\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2220\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2240\n", "2260\n", "2280\n", "2300\n", "2320\n", "2340\n", "2360\n", "2380\n", "2400\n", "2420\n", "2440\n", "2460\n", "2480\n", "2500\n", "2520\n", "2540\n", "2560\n", "2580\n", "2600\n", "2620\n", "2640\n", "2660\n", "2680\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2700\n", "2720\n", "2740\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", 
"output_type": "stream", "text": [ "2760\n", "2780\n", "2800\n", "2820\n", "2840\n", "2860\n", "2880\n", "2900\n", "2920\n", "2940\n", "2960\n", "2980\n", "3000\n", "3020\n", "3040\n", "3060\n", "3080\n", "3100\n", "3120\n", "3140\n", "3160\n", "3180\n", "3200\n", "3220\n", "3240\n", "3260\n", "3280\n", "3300\n", "3320\n", "3340\n", "3360\n", "3380\n", "3400\n", "3420\n", "3440\n", "3460\n", "3480\n", "3500\n", "3520\n", "3540\n", "3560\n", "3580\n", "3600\n", "3620\n", "3640\n", "3660\n", "3680\n", "3700\n", "3720\n", "3740\n", "3760\n", "3780\n", "3800\n", "3820\n", "3840\n", "3860\n", "3880\n", "3900\n", "3920\n", "3940\n", "3960\n", "3980\n", "4000\n", "4020\n", "4040\n", "4060\n", "4080\n", "4100\n", "4120\n", "4140\n", "4160\n", "4180\n", "4200\n", "4220\n", "4240\n", "4260\n", "4280\n", "4300\n", "4320\n", "4340\n", "4360\n", "4380\n", "4400\n", "4420\n", "4440\n", "4460\n", "4480\n", "4500\n", "4520\n", "4540\n", "4560\n", "4580\n", "4600\n", "4620\n", "4640\n", "4660\n", "4680\n", "4700\n", "4720\n", "4740\n", "4760\n", "4780\n", "4800\n", "4820\n", "4840\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4860\n", "4880\n", "4900\n", "4920\n", "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "20\n", "40\n", "60\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "80\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ 
"100\n", "120\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "140\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "160\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "180\n", "200\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "220\n", "240\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "260\n", "280\n", "300\n", "320\n", "340\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "360\n", "380\n", "400\n", "420\n", "440\n", "460\n", "480\n", "500\n", "520\n", "540\n", "560\n", "580\n", "600\n", "620\n", "640\n", "660\n", "680\n", "700\n", "720\n", "740\n", "760\n", "780\n", "800\n", "820\n", "840\n", "860\n", "880\n", "900\n", "920\n", "940\n", "960\n", "980\n", "1000\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1020\n", "1040\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced 
with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1060\n", "1080\n", "1100\n", "1120\n", "1140\n", "1160\n", "1180\n", "1200\n", "1220\n", "1240\n", "1260\n", "1280\n", "1300\n", "1320\n", "1340\n", "1360\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1380\n", "1400\n", "1420\n", "1440\n", "1460\n", "1480\n", "1500\n", "1520\n", "1540\n", "1560\n", "1580\n", "1600\n", "1620\n", "1640\n", "1660\n", "1680\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n", "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1700\n", "1720\n", "1740\n", "1760\n", "1780\n", "1800\n", "1820\n", "1840\n", "1860\n", "1880\n", "1900\n", "1920\n", "1940\n", "1960\n", "1980\n", "2000\n", "2020\n", "2040\n", "2060\n", "2080\n", "2100\n", "2120\n", "2140\n", "2160\n", "2180\n", "2200\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2220\n", "2240\n", "2260\n", "2280\n", "2300\n", "2320\n", "2340\n", "2360\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n", "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2380\n", "2400\n", "2420\n", "2440\n", "2460\n", "2480\n", "2500\n", "2520\n", "2540\n", "2560\n", "2580\n", "2600\n", "2620\n", "2640\n", "2660\n", "2680\n", "2700\n", "2720\n", "2740\n", "2760\n", "2780\n", "2800\n", "2820\n", "2840\n", 
"2860\n", "2880\n", "2900\n", "2920\n", "2940\n", "2960\n", "2980\n", "3000\n", "3020\n", "3040\n", "3060\n", "3080\n", "3100\n", "3120\n", "3140\n", "3160\n", "3180\n", "3200\n", "3220\n", "3240\n", "3260\n", "3280\n", "3300\n", "3320\n", "3340\n", "3360\n", "3380\n", "3400\n", "3420\n", "3440\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "3460\n", "3480\n", "3500\n", "3520\n", "3540\n", "3560\n", "3580\n", "3600\n", "3620\n", "3640\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n", "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "3660\n", "3680\n", "3700\n", "3720\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "3740\n", "3760\n", "3780\n", "3800\n", "3820\n", "3840\n", "3860\n", "3880\n", "3900\n", "3920\n", "3940\n", "3960\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "3980\n", "4000\n", "4020\n", "4040\n", "4060\n", "4080\n", "4100\n", "4120\n", "4140\n", "4160\n", "4180\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4200\n", "4220\n", "4240\n", "4260\n", "4280\n", "4300\n", "4320\n", "4340\n", "4360\n", "4380\n", "4400\n", "4420\n", "4440\n", "4460\n", "4480\n", "4500\n" ] }, { "name": "stderr", "output_type": 
"stream", "text": [ "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n", "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4520\n", "4540\n", "4560\n", "4580\n", "4600\n", "4620\n", "4640\n", "4660\n", "4680\n", "4700\n", "4720\n", "4740\n", "4760\n", "4780\n", "4800\n", "4820\n", "4840\n", "4860\n" ] } ], "source": [ "PREFIX_PATH = \"/Users/Rrando/Documents/common_crawl/CC-MAIN-20241102010035-20241102040035-\"\n", "file_list = [\"00000\", \"00002\", \"00003\", \"00004\", \"00005\"]\n", "\n", "for fname in file_list:\n", " r = extract_english_files(PREFIX_PATH + fname + \".warc.gz\")\n", " cc_corpus = pd.DataFrame(r)\n", " cc_corpus.to_csv(f\"../data/external/common_crawl_{fname}.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "78a61fac-b3d7-41f6-8f96-3b5683715451", "metadata": {}, "outputs": [], "source": [ "r = extract_non_english_latin('/Users/Rrando/crawl/out/CC-MAIN-20250218081924-20250218111924-00893.warc.gz')\n", "cc_corpus = pd.DataFrame(r)\n", "cc_corpus.to_csv(\"../data/external/common_crawl_non_english.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "c1bc4277-c415-4d36-9406-81d3f9e88209", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 5 }

notebooks/Common Crawl.ipynb (2,805 lines of code) (raw):